File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolve conflicts

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package.
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package.
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package.

Read data

all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl  (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sens = read_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')
## Rows: 32 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (6): nr_detected, nr_expected, sensitivity, perc_compound_val, total_n, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

How many transcripts have circRNAs?

ENST

all_circ %>%
  filter(!is.na(ENST), !ENST == 'ambiguous') %>%
  select(ENST) %>% unique() %>%
  nrow()
## [1] 17461
# NA or ambiguous => strand needs to be taken into account
all_circ %>% select(circ_id_strand, ENST) %>% unique() %>%
  filter(is.na(ENST) | ENST == 'ambiguous') %>%
  count(ENST)
# total nr

# ensembl = read_tsv('/Users/mivromma/Documents/PhD/indexes/Homo_sapiens.GRCh38.103.gtf',
#                    col_names = c('chr', 'havana', 'type', 'start', 'end', 'dot', 'strand', 'dot2', 'info'))
# 
# ensembl %>% filter(type == "transcript") %>%
#   separate(info, into = c('gene_id', 'gene_version', 'transcrip_id', 'rest'), 
#            extra = 'merge', sep = ';') %>%
#   select(transcrip_id) %>% unique()

17461/234393
## [1] 0.07449455
# can = read_tsv('known_exons_GRCh38.103_canonical.bed', col_names = c('chr', 'start', 'end', 'exon', 'dot', 'strand'))
# 
# can %>% separate(exon, into = c('ENST', 'ENST_nr', 'exon', 'exon_nr'), sep = '_') %>%
#   select(ENST) %>% unique() %>%
#   inner_join(all_circ %>% select(ENST) %>% unique())

17461/60427
## [1] 0.2889602

CircRNA length stats

All circRNAs

with introns

all_circ %>% select(chr, start, end, estim_len_in) %>% unique() %>%
  pull(estim_len_in) %>% quantile()
##     0%    25%    50%    75%   100% 
##     31    433   1428   8391 999911

no introns

all_circ %>% select(chr, start, end, estim_len_ex) %>% unique() %>%
  filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
  mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
  pull(estim_len_ex) %>% quantile()
##    0%   25%   50%   75%  100% 
##     1   275   485   900 37221

CircRNAs that don’t get validated with RNase R

do not get validated (no introns)

cq %>% filter(qPCR_val == 'pass',
              RR_val == 'fail',
              amp_seq_val == 'pass') %>%
  select(chr, start, end, estim_len_ex) %>% unique() %>%
  filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
  mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
  pull(estim_len_ex) %>% quantile()
##      0%     25%     50%     75%    100% 
##  115.00 2109.75 2381.50 3483.00 4758.00

BSJ count distribution

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  pull(BSJ_count) %>%
  quantile()
##   0%  25%  50%  75% 100% 
##    1    1    1    3 4007

how many with BSJ < 5

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  count(count_group)
round(945201/(945201+146700), 3)
## [1] 0.866

how many with BSJ >= 2

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  filter(BSJ_count >= 2) %>% nrow()
## [1] 503237
round(503237/(945201+146700), 3)
## [1] 0.461

How many circRNAs are detected on different strands

all_circ %>% 
  filter(!strand == 'unknown') %>%
  select(chr, start, end) %>% unique() %>%
  nrow()
## [1] 156433
all_circ %>% 
  filter(!strand == 'unknown') %>%
  select(chr, start, end, strand) %>% unique() %>%
  nrow()
## [1] 172744
round((172744-156433)/156433, 3)
## [1] 0.104

CircRNA numbers

nr of circ detected per tool

all_circ %>%
  filter(cell_line == 'HLF') %>%
  group_by(tool) %>%
  count() %>%
  arrange(n)

nr of unique circ

all_circ %>% select(circ_id) %>%
  unique()

total nr of circ

nrow(all_circ)
## [1] 1137099